Story 6#

Andrew Bowen DATA 621 CUNY Data Science Masters

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json
import plotly.express as px
from urllib.request import urlopen
import plotly.figure_factory as ff


import shapely
import warnings
from shapely.errors import ShapelyDeprecationWarning
warnings.filterwarnings("ignore", category=ShapelyDeprecationWarning)

Data Sources#

I found US Food Security data published by the US Census Bureau here

# Original read-in, posted on GitHub for quicker read-in
# dat = pd.read_csv("https://www2.census.gov/programs-surveys/cps/datasets/2022/supp/dec22pub.csv")
dat = pd.read_parquet("https://github.com/andrewbowen19/storiesDATA608/blob/main/data/census-food-security.parquet?raw=true")

dat.head()
HRHHID HRMONTH HRYEAR4 HURESPLI HUFINAL FILLER HETENURE HEHOUSUT HETELHHD HETELAVL ... HRFS30D1 HRFS30D2 HRFS30D3 HRFS30D4 HRFS30D5 HRFS30D6 HRFS30D7 HRFS30D8 HRFS30D9 HRFS30DE
0 351819007700950 12 2022 -1 225 NaN -1 1 -1 -1 ... -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
1 143200010045972 12 2022 -1 226 NaN -1 1 -1 -1 ... -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
2 201348501120003 12 2022 -1 225 NaN 1 5 -1 -1 ... -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
3 610009070389621 12 2022 -1 226 NaN -1 1 -1 -1 ... -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
4 110862907936 12 2022 -1 226 NaN -1 1 -1 -1 ... -1 -1 -1 -1 -1 -1 -1 -1 -1 -1

5 rows × 507 columns

dat.to_parquet("data/census-food-security.parquet")

Data Wrangling#

We’ll need to do a significant amount of column renaming. Using this data dictionary

Children’s Food Security Scale variables are coded as “Not in Universe” (-1) if there were no children in the household.

# Rename dataframe columns
column_renames = {
    "HRHHID": "household_id",
    "PESEX": "sex",
    "HRFS12M1": "food_security_level",
    "HRFS12M6": "raw_child_food_security_score",
    "HRFS12MC": "child_food_security_level",
    "HRFS12M8": "adult_food_security_level"
    }

df = dat.rename(columns=column_renames)
df = df[list(column_renames.values())]
df.head()
household_id sex food_security_level raw_child_food_security_score child_food_security_level adult_food_security_level
0 351819007700950 -1 -1 -1 -1 -1
1 143200010045972 -1 -1 -1 -1 -1
2 201348501120003 -1 -1 -1 -1 -1
3 610009070389621 -1 -1 -1 -1 -1
4 110862907936 -1 -1 -1 -1 -1
df.child_food_security_level.unique()
array([-1,  1,  2,  3, -9])
child_level_map =  {3: "Food Secure", 2: "Low Food Security", 1: "Very Low Food Security", -9: "Not in Universe", -1: "No Response"}

adult_level_map = {4: "High", 3: "Marginal", 2: "Low", 1: "Very Low", -9: "Not in Universe", -1: "No Response"}

# Map values
df['child_food_security_level'] = df.child_food_security_level.map(child_level_map)
df['adult_food_security_level'] = df.adult_food_security_level.map(adult_level_map)

Data Visualization#

child_food_security = df.loc[(df.child_food_security_level!="No Response") &
                             (df.child_food_security_level!="Not in Universe")].groupby('child_food_security_level')['child_food_security_level'].count()
child_food_security
child_food_security_level
Food Secure                 335
Low Food Security          2503
Very Low Food Security    30527
Name: child_food_security_level, dtype: int64
# Plot child food security levels
f, ax = plt.subplots(figsize=(12,10))

ax.bar(child_food_security.index, child_food_security.values)
ax.set_xlabel("Food Security Level - USDA")
ax.set_ylabel("Number of Children")

ax.set_title("The majority of survey respondents with children did not meet the USDA's definition of food security")
f.suptitle("Childhood Food Security - USDA 2022", fontsize=18)
Text(0.5, 0.98, 'Childhood Food Security - USDA 2022')
_images/8a3433386728b5f420bc05704facf11138e4cdb3ffce11b3d273e039995c39ed.png
adult_food_security = df.loc[(df.adult_food_security_level!="No Response") &
                             (df.adult_food_security_level!="Not in Universe")].groupby('adult_food_security_level')['adult_food_security_level'].count()
adult_food_security
adult_food_security_level
High         3474
Low          6845
Marginal     5217
Very Low    59280
Name: adult_food_security_level, dtype: int64
# Plot adult food security levels
f, ax = plt.subplots(figsize=(12,10))

ax.bar(adult_food_security.index, adult_food_security.values)
ax.set_xlabel("Food Security Level - USDA")
ax.set_ylabel("Number of Adults")

ax.set_title("The majority of adult survey respondents did not meet the USDA's definition of food security")
f.suptitle("Adult Food Security - USDA 2022", fontsize=18)
Text(0.5, 0.98, 'Adult Food Security - USDA 2022')
_images/d43c85b5a9fdb24e3b0de3fc3bf04fc55cc57cacb47c60618b8c9c86abd4db6d.png

The excel file included is posted on the US Department of Agriculture website here. This comes from their Food Atlas initiative to give an overview of food access within the United States. I’ve converted the file to parquet for easier read-in times.

# Data Dictionary for USDA Food Atlas data
# dd = pd.read_excel("/Users/andrewbowen/Downloads/FoodAccessResearchAtlasData2019.xlsx", sheet_name="Variable Lookup")

# dd.to_csv("data/food-atlass-dd.csv", index=False)
dd = pd.read_csv("https://raw.githubusercontent.com/andrewbowen19/storiesDATA608/main/data/food-atlass-dd.csv")
# Initial read in, converting to parquet format for easier storage
# Uncomment the block below and move the excel file linked into your local data folder

# food_locs = pd.read_excel("/Users/andrewbowen/Downloads/FoodAccessResearchAtlasData2019.xlsx", sheet_name="Food Access Research Atlas")
# food_locs.to_parquet("./data/food_access.parquet", index=False)
# food_locs.head()

I uploaded the dataset above as parquet files to my GitHub here

# Read in food location data
food_data_url = "https://github.com/andrewbowen19/storiesDATA608/blob/main/data/food_access.parquet?raw=true"
food_locs = pd.read_parquet(food_data_url, engine="auto")
# Cleaning up food location data

# Prepend Census tracts with digits if needed ot match Census Labels
food_locs['CensusTract'] = food_locs['CensusTract'].astype(str).str.zfill(11)

food_locs['FIPS'] = food_locs['CensusTract'].str.slice(0, 5)

food_locs.head()
CensusTract State County Urban Pop2010 OHU2010 GroupQuartersFlag NUMGQTRS PCTGQTRS LILATracts_1And10 ... TractBlack TractAsian TractNHOPI TractAIAN TractOMultir TractHispanic TractHUNV TractSNAP FIPS county
0 01001020100 Alabama Autauga County 1 1912 693 0 0.0 0.000000 0 ... 217.0 14.0 0.0 14.0 45.0 44.0 6.0 102.0 01001 Autauga
1 01001020200 Alabama Autauga County 1 2170 743 0 181.0 8.341014 1 ... 1217.0 5.0 0.0 5.0 55.0 75.0 89.0 156.0 01001 Autauga
2 01001020300 Alabama Autauga County 1 3373 1256 0 0.0 0.000000 0 ... 647.0 17.0 5.0 11.0 117.0 87.0 99.0 172.0 01001 Autauga
3 01001020400 Alabama Autauga County 1 4386 1722 0 0.0 0.000000 0 ... 193.0 18.0 4.0 11.0 74.0 85.0 21.0 98.0 01001 Autauga
4 01001020500 Alabama Autauga County 1 10766 4082 0 181.0 1.681219 0 ... 1437.0 296.0 9.0 48.0 310.0 355.0 230.0 339.0 01001 Autauga

5 rows × 149 columns

food_locs['county'] = food_locs['County'].str.replace(" County", "")
df = pd.DataFrame(food_locs.groupby(["county", "FIPS"]).mean()).reset_index()
df = df.fillna(0)
df.head()
county FIPS Urban Pop2010 OHU2010 GroupQuartersFlag NUMGQTRS PCTGQTRS LILATracts_1And10 LILATracts_halfAnd10 ... TractSeniors TractWhite TractBlack TractAsian TractNHOPI TractAIAN TractOMultir TractHispanic TractHUNV TractSNAP
0 Abbeville 45001 0.166667 4236.166667 1665.000000 0.000000 150.166667 3.392506 0.333333 0.333333 ... 700.500000 2948.500000 1197.833333 12.500000 0.833333 10.000000 66.500000 42.500000 169.000000 325.833333
1 Acadia Parish 22001 0.500000 5147.750000 1903.416667 0.000000 87.500000 1.651573 0.250000 0.500000 ... 657.166667 4093.916667 931.250000 11.416667 0.500000 13.583333 97.083333 88.333333 153.750000 363.250000
2 Accomack 51001 0.000000 3684.888889 1533.111111 0.000000 47.555556 0.923669 0.111111 0.111111 ... 704.000000 2406.888889 1033.666667 20.333333 4.444444 15.000000 204.555556 316.666667 150.222222 213.333333
3 Ada 16001 0.932203 6650.254237 2516.016949 0.016949 164.644068 2.747349 0.016949 0.186441 ... 695.728814 6005.881356 75.271186 159.440678 14.593220 44.237288 350.830508 472.966102 109.000000 214.220339
4 Adair 19001 0.000000 2560.666667 1097.333333 0.000000 50.333333 1.737189 0.000000 0.000000 ... 548.000000 2519.333333 3.666667 7.333333 0.333333 1.333333 28.666667 33.666667 40.666667 123.666667

5 rows × 146 columns

County GeoJSON can be found here

with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)
# https://plotly.com/python/choropleth-maps/
fig = px.choropleth(df, geojson=counties, locations='FIPS', color='lakids10share',
                           color_continuous_scale="Inferno",
                           title="<b>USDA 2019:</b> Percent of children living beyond 10 miles from supermarket<br><sup>\
                                Some US Counties have up to 35% of children who live more than 10 miles away from a supermarket</sup>",
                           labels={"lakids10share": "% Children"},
                           scope="usa")

fig.update_traces(marker_line_width=0)
fig.show()

As we see, many US counties, particularly in the western half of the country, lack walkable access to supermarkets

# Population more than 20 miles away from supermarket
fig = px.choropleth(df, geojson=counties, locations='FIPS', color='lasnaphalfshare',
                           title="<b>Number of Americans Receiving SNAP Benefits: 2019<br><sup>\
                                With so much dependence on the program, additional funding and outreach could raise political awareness of the benefits and benefactors</sup>",
                           labels={"TractSNAP": "Number of Citizens"},
                           scope="usa")

fig.update_traces(marker_line_width=0)
fig.show()

I downloaded budget data from the US White House website. We can use this to see how the breakdown of Federal spending allocates resources to programs like SNAP which work to help feed food insecure individuals in America.

# Original read-in
# budget = pd.read_excel("/Users/andrewbowen/Downloads/hist04z2_fy2024.xlsx", header=1)

budget_path = "https://raw.githubusercontent.com/andrewbowen19/storiesDATA608/main/data/us-budget.csv"
budget = pd.read_csv(budget_path)
budget['department'] = budget['Department or other unit']

# Send to CSV locally
# budget.to_csv("data/us-budget.csv")
# Basic cleaning of budget data
budget = budget.replace('-*', np.nan).replace("..........", np.nan).replace("*", np.nan)
budget = budget.apply(pd.to_numeric, errors='coerce').fillna(budget)

# Convert to floats and only take positive values representing percentage of overall budget
budget['2022'] = budget['2022'].astype(float)
budget = budget.loc[(budget['2022'] >= 0.0) & (budget['department'] != "Total outlays")]
# Highlight Dep of Ag (including SNAP) rows
budget['is_agriculture'] = np.where(budget['department'] == "Department of Agriculture", 'r', 'b')

# Sort by percentage of budget allocated
budget = budget[['department', '2022', 'is_agriculture']].sort_values(by="2022", ascending=True).set_index("department")
# Plotting budget by department
ag_title = "<b>US Government Budget by Department</b><br><sup>The department of Agriculture (which contains the SNAP program) receives only 3.9% of our overall budget.<br>Are we allocating resources adequately to feed americans?</sup>"

fig = px.bar(budget,
       x='2022', orientation='h',
       title=ag_title,
       width=1200, height=800,
       labels={
                "department": "US Government Department",
                "2022": "Percentage of 2022 Federal Budget Overlays"
                 },
       color='is_agriculture', category_orders={'department': budget.index[::-1]}
)


fig.show()